In [1]:
#### Importing Libraries ####

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
In [2]:
dataset = pd.read_csv('financial_data.csv')
In [3]:
dataset.head()
Out[3]:
entry_id age pay_schedule home_owner income months_employed years_employed current_address_year personal_account_m personal_account_y ... amount_requested risk_score risk_score_2 risk_score_3 risk_score_4 risk_score_5 ext_quality_score ext_quality_score_2 inquiries_last_month e_signed
0 7629673 40 bi-weekly 1 3135 0 3 3 6 2 ... 550 36200 0.737398 0.903517 0.487712 0.515977 0.580918 0.380918 10 1
1 3560428 61 weekly 0 3180 0 6 3 2 7 ... 600 30150 0.738510 0.881027 0.713423 0.826402 0.730720 0.630720 9 0
2 6934997 23 weekly 0 1540 6 0 0 7 1 ... 450 34550 0.642993 0.766554 0.595018 0.762284 0.531712 0.531712 7 0
3 5682812 40 bi-weekly 0 5230 0 6 1 2 7 ... 700 42150 0.665224 0.960832 0.767828 0.778831 0.792552 0.592552 8 1
4 5335819 33 semi-monthly 0 3590 0 5 2 2 8 ... 1100 53850 0.617361 0.857560 0.613487 0.665523 0.744634 0.744634 12 0

5 rows × 21 columns

In [4]:
dataset.columns
Out[4]:
Index(['entry_id', 'age', 'pay_schedule', 'home_owner', 'income',
       'months_employed', 'years_employed', 'current_address_year',
       'personal_account_m', 'personal_account_y', 'has_debt',
       'amount_requested', 'risk_score', 'risk_score_2', 'risk_score_3',
       'risk_score_4', 'risk_score_5', 'ext_quality_score',
       'ext_quality_score_2', 'inquiries_last_month', 'e_signed'],
      dtype='object')
In [5]:
dataset.describe()
Out[5]:
entry_id age home_owner income months_employed years_employed current_address_year personal_account_m personal_account_y has_debt amount_requested risk_score risk_score_2 risk_score_3 risk_score_4 risk_score_5 ext_quality_score ext_quality_score_2 inquiries_last_month e_signed
count 1.790800e+04 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000 17908.000000
mean 5.596978e+06 43.015412 0.425173 3657.214653 1.186006 3.526860 3.584711 3.427183 3.503350 0.795399 950.446449 61086.302211 0.690878 0.878276 0.583155 0.718252 0.623112 0.622068 6.457226 0.538251
std 2.562473e+06 11.873107 0.494383 1504.890063 2.400897 2.259732 2.751937 2.216440 1.955568 0.403421 698.543683 15394.255020 0.090470 0.054563 0.125061 0.120697 0.139729 0.139898 3.673093 0.498549
min 1.111398e+06 18.000000 0.000000 905.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 350.000000 2100.000000 0.023258 0.451371 0.016724 0.153367 0.010184 0.006622 1.000000 0.000000
25% 3.378999e+06 34.000000 0.000000 2580.000000 0.000000 2.000000 2.000000 2.000000 2.000000 1.000000 600.000000 49350.000000 0.640993 0.850882 0.500208 0.633708 0.521735 0.519677 4.000000 0.000000
50% 5.608376e+06 42.000000 0.000000 3260.000000 0.000000 3.000000 3.000000 2.000000 3.000000 1.000000 700.000000 61200.000000 0.699561 0.881004 0.588208 0.725113 0.625944 0.622974 6.000000 1.000000
75% 7.805624e+06 51.000000 1.000000 4670.000000 1.000000 5.000000 5.000000 5.000000 4.000000 1.000000 1100.000000 72750.000000 0.752887 0.912608 0.672395 0.806681 0.729841 0.728940 8.000000 1.000000
max 9.999874e+06 96.000000 1.000000 9985.000000 11.000000 16.000000 12.000000 11.000000 15.000000 1.000000 10200.000000 99750.000000 0.999997 0.999024 0.978932 0.996260 0.970249 0.966953 30.000000 1.000000
In [6]:
dataset.isnull().any()
Out[6]:
entry_id                False
age                     False
pay_schedule            False
home_owner              False
income                  False
months_employed         False
years_employed          False
current_address_year    False
personal_account_m      False
personal_account_y      False
has_debt                False
amount_requested        False
risk_score              False
risk_score_2            False
risk_score_3            False
risk_score_4            False
risk_score_5            False
ext_quality_score       False
ext_quality_score_2     False
inquiries_last_month    False
e_signed                False
dtype: bool
In [7]:
dataset2 = dataset.drop(['entry_id', 'pay_schedule', 'e_signed'], axis = 1)
In [9]:
# paiwise scatter plot

plt.figure(figsize=(20, 10))
sn.pairplot(dataset2)
plt.show()
<Figure size 1440x720 with 0 Axes>
In [43]:
sn.distplot(dataset['income'])
plt.show()
In [44]:
sn.distplot(dataset['amount_requested'])
plt.show()
In [45]:
sn.distplot(dataset['risk_score'])
plt.show()
In [46]:
sn.distplot(dataset['inquiries_last_month'])
plt.show()
In [10]:
dataset2.corrwith(dataset.e_signed).plot.bar(figsize = (18, 15),
                  title= 'Correlation with Esigned', fontsize = 15,
                  rot = 45, grid = True)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x253ec4b9e48>
In [11]:
## Correlation Matrix
sn.set(style="white")

# Compute the correlation matrix
corr = dataset2.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(22, 20))

# Generate a custom diverging colormap
cmap = sn.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sn.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x253e2573eb8>
In [12]:
# Feature Engineering

dataset = dataset.drop(['months_employed'], axis = 1)
In [13]:
dataset['personal_account_months'] = (dataset.personal_account_m + (dataset.personal_account_y * 12))
In [14]:
dataset[['personal_account_m', 'personal_account_y', 'personal_account_months']].head()
Out[14]:
personal_account_m personal_account_y personal_account_months
0 6 2 30
1 2 7 86
2 7 1 19
3 2 7 86
4 2 8 98
In [15]:
dataset = dataset.drop(['personal_account_m', 'personal_account_y'],axis = 1)
In [16]:
# One Hot Encoding
dataset = pd.get_dummies(dataset)
In [17]:
dataset.columns
Out[17]:
Index(['entry_id', 'age', 'home_owner', 'income', 'years_employed',
       'current_address_year', 'has_debt', 'amount_requested', 'risk_score',
       'risk_score_2', 'risk_score_3', 'risk_score_4', 'risk_score_5',
       'ext_quality_score', 'ext_quality_score_2', 'inquiries_last_month',
       'e_signed', 'personal_account_months', 'pay_schedule_bi-weekly',
       'pay_schedule_monthly', 'pay_schedule_semi-monthly',
       'pay_schedule_weekly'],
      dtype='object')
In [18]:
dataset = dataset.drop(['pay_schedule_semi-monthly'], axis = 1)
In [19]:
# Removing extra columns
response = dataset["e_signed"]
users = dataset['entry_id']
dataset = dataset.drop(["e_signed", "entry_id"], axis =1)
In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dataset, response, test_size = 0.2, random_state = 0)
In [22]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train2 = pd.DataFrame(sc_X.fit_transform(X_train))
X_test2 = pd.DataFrame(sc_X.fit_transform(X_test))
X_train2.columns = X_train.columns.values
X_test2.columns = X_test.columns.values
X_train2.index = X_train.index.values
X_test.index = X_test.index.values
X_train = X_train2
X_test = X_test2
In [23]:
#### Model Building ####

### Comparing Models

## Logistic Regression
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0, penalty = 'l1')
classifier.fit(X_train, y_train)
C:\Users\TariqueAkhtar\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
Out[23]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
In [24]:
# Predicting Test Set
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
In [25]:
results = pd.DataFrame([['LogisticRegression acc', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
print(results)
In [26]:
## SVM (Linear)
from sklearn.svm import SVC
classifier = SVC(random_state = 0, kernel = 'linear')
classifier.fit(X_train, y_train)
Out[26]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)
In [27]:
# Predicting Test Set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
In [28]:
model_results = pd.DataFrame([['SVM (Linear)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
In [29]:
results = results.append(model_results, ignore_index = True)
In [30]:
## SVM (rbf)
from sklearn.svm import SVC
classifier = SVC(random_state = 0, kernel = 'rbf')
classifier.fit(X_train, y_train)
Out[30]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=0,
    shrinking=True, tol=0.001, verbose=False)
In [31]:
# Predicting Test Set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
In [32]:
model_results = pd.DataFrame([['SVM (RBF)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
In [33]:
results = results.append(model_results, ignore_index = True)
In [34]:
## random forest classifier
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(random_state = 0, n_estimators = 100,
                                    criterion = 'entropy')
classifier.fit(X_train, y_train)
Out[34]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)
In [35]:
# Predicting Test Set
y_pred = classifier.predict(X_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
In [36]:
model_results = pd.DataFrame([['Random Forest (n=100)', acc, prec, rec, f1]],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
In [37]:
results = results.append(model_results, ignore_index = True)
In [39]:
## K-fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X= X_train, y = y_train,
                             cv = 10)
In [40]:
print("Random Forest Classifier Accuracy: %0.2f (+/- %0.2f)"  % (accuracies.mean(), accuracies.std() * 2))
Random Forest Classifier Accuracy: 0.63 (+/- 0.03)
In [49]:
results = results.append(model_results, ignore_index = True)
In [51]:
results
Out[51]:
Model Accuracy Precision Recall F1 Score
0 Linear Regression (Lasso) 0.563372 0.577778 0.701245 0.633552
1 SVM (Linear) 0.567839 0.578189 0.728734 0.644791
2 SVM (RBF) 0.592686 0.607519 0.687241 0.644926
3 Random Forest (n=100) 0.623953 0.643741 0.674793 0.658901
4 Random Forest (n=100) 0.623953 0.643741 0.674793 0.658901
In [ ]: